PART ONE : Exploration
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(ggridges)
df<- readr::read_csv("cs_1675_fall2021_finalproject.csv", col_names = TRUE)
## Rows: 1252 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): m
## dbl (10): x1, x2, x3, x4, v1, v2, v3, v4, v5, output
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- df %>%
mutate(outcome = ifelse(output < 0.33, 'event', 'non_event'),
outcome = factor(outcome, levels = c("event", "non_event"))) %>%
mutate(y = boot::logit(output)) %>% glimpse()
## Rows: 1,252
## Columns: 13
## $ x1 <dbl> 0.025878, 0.030768, 0.019325, 0.306212, 0.031296, 0.031073, 0.…
## $ x2 <dbl> 0.255934, 0.261575, 0.020877, 0.033379, 0.259342, 0.027119, 0.…
## $ x3 <dbl> 0.492830, 0.498460, 0.258360, 0.255385, 0.264387, 0.260915, 0.…
## $ x4 <dbl> 0.012770, 0.055779, 0.012424, 0.056190, 0.056594, 0.055192, 0.…
## $ v1 <dbl> 0.275651, 0.343204, 4.998508, 5.090153, 5.031107, 9.977407, 0.…
## $ v2 <dbl> 0.033657, 0.027082, 0.030259, 0.052342, 0.517705, 0.532436, 1.…
## $ v3 <dbl> 1.166214, 1.260579, 1.298285, 1.322005, 1.368195, 1.298797, 1.…
## $ v4 <dbl> 0.408402, 0.664248, 0.412870, 0.652111, 0.533701, 0.857509, 0.…
## $ v5 <dbl> 0.525226, 2.866343, 0.409007, 0.861594, 6.451933, 0.958574, 0.…
## $ m <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
## $ output <dbl> 0.786, 0.730, 0.996, 0.326, 0.735, 0.954, 0.969, 0.986, 0.874,…
## $ outcome <fct> non_event, non_event, non_event, event, non_event, non_event, …
## $ y <dbl> 1.3009808, 0.9946226, 5.5174529, -0.7263327, 1.0201407, 3.0320…
derived_df <- df %>%
mutate(x5 = 1 - (x1 + x2 + x3 + x4),
w = x2 / (x3 + x4),
z = (x1 + x2) / (x5 + x4),
t = v1 * v2)
Base features:
> Chemistry variables: x1:x4
> Manufacturing process variables: v1:v5, m
Derived features: x5, w, z, and t
> x5 = 1 - (x1 + x2 + x3 + x4)
> w = x2 / (x3 + x4)
> z = (x1 + x2) / (x4 + x5)
> t=v1v2
Output: output*
Logit-transformed response: y = boot::logit(output)
visdat::vis_dat(df) #checking dataypes per feature
df %>% subset(select = -output)%>% purrr::map_dbl(n_distinct) #distinct values per feature
## x1 x2 x3 x4 v1 v2 v3 v4 v5 m
## 1245 1250 1250 1235 1252 1249 1252 1252 1252 5
## outcome y
## 2 690
All base features have roughly the same number of observations, with the exception of m. The machine (m) input has relatively few unique values. Visually checking the counts associated with each unique value for m:
#df %>% count(m) #counts
df%>%
subset(select = -output)%>%
ggplot(mapping=aes(m)) +
geom_bar() +
theme_bw()
df %>%
select(starts_with('x'), m) %>%
tibble::rowid_to_column() %>%
pivot_longer(starts_with('x'))%>%
ggplot(mapping = aes(x= value)) +
geom_histogram(bins = 21) +
facet_wrap(~name, scales = 'free_x')
df %>%
select(starts_with('v'), m) %>%
tibble::rowid_to_column() %>%
pivot_longer(starts_with('v'))%>%
ggplot(mapping = aes(x= value)) +
geom_histogram(bins = 21) +
facet_wrap(~name, scales = 'free_x')
derived_df %>%
select(c(x5,t,w,z)) %>%
tibble::rowid_to_column() %>%
pivot_longer(c(x5,t,w,z))%>%
ggplot(mapping = aes(x= value)) +
geom_histogram(bins = 21) +
facet_wrap(~name, scales = 'free_x')
### Distribution of the responses:
derived_df %>%
select(c(y,output)) %>%
tibble::rowid_to_column() %>%
pivot_longer(c(y,output))%>%
ggplot(mapping = aes(x= value)) +
geom_histogram(bins = 21) +
facet_wrap(~name, scales = 'free_x')
The logit transformed response pushes the output to resemble a normal distribution.
derived_df %>%
select(c(x5,t,w,z), m) %>%
tibble::rowid_to_column() %>%
pivot_longer(c(x5,t,w,z))%>%
ggplot(mapping = aes(x= value)) +
geom_freqpoly(mapping = aes(color = m, y = stat(density))) +
facet_wrap(~name, scales = 'free')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
derived_df %>%
select(c(output,y), m) %>%
tibble::rowid_to_column() %>%
pivot_longer(c(output,y))%>%
ggplot(mapping = aes(x= value)) +
geom_freqpoly(mapping = aes(color = m, y = stat(density))) +
facet_wrap(~name, scales = 'free')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
There seems to be some variation on the output extremes (approaching 1.0) per machine. However, the logic transformed response, y, shows no significant variation.
df %>%
select(starts_with('x'), m) %>%
tibble::rowid_to_column() %>%
pivot_longer(starts_with('x'))%>%
ggplot(mapping = aes(x= value)) +
geom_freqpoly(mapping = aes(color = m, y = stat(density))) +
facet_wrap(~name, scales = 'free')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df %>%
select(starts_with('v'), m) %>%
tibble::rowid_to_column() %>%
pivot_longer(starts_with('v'))%>%
ggplot(mapping = aes(x= value)) +
geom_freqpoly(mapping = aes(color = m, y = stat(density))) +
facet_wrap(~name, scales = 'free')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
No feature varies significantly according to ‘m’.
df %>%
select(starts_with('x'), m) %>%
tibble::rowid_to_column() %>%
pivot_longer(starts_with('x'))%>%
ggplot(mapping = aes(x= m)) +
geom_boxplot(mapping = aes( y = value))+
stat_summary(fun.data = 'mean_se', mapping = aes(y = value), fun.args = list(mult = 2))+
facet_wrap(~name, scales = 'free')
The distributions of each chemistry & manufacturing process feature appear roughly uniform across machines; machine doesn’t appear to impact any single variable in particular.
derived_df %>%
select(c(x5,t,w,z), m) %>%
tibble::rowid_to_column() %>%
pivot_longer(c(x5,t,w,z))%>%
ggplot(mapping = aes(x= m)) +
geom_boxplot(mapping = aes( y = value))+
stat_summary(fun.data = 'mean_se', mapping = aes(y = value), fun.args = list(mult = 2))+
facet_wrap(~name, scales = 'free')
Features ‘t’ and ‘x5’ appear to contain high value outliers. However, machine still does not appear to affect any derived feature.
derived_df %>%
select(-m, -output, -outcome) %>%
cor() %>%
corrplot::corrplot(method = 'number',type = 'upper')
No input has significant correlation (>0.85). However, features x5 and z seem to have a moderately negative correlation with respect to each other.
df %>%
select(starts_with('x'), -x4, y) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "y")) %>%
ggplot(mapping = aes(x = value, y = y)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
df %>%
select(starts_with('x'), -x4, output) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "output")) %>%
ggplot(mapping = aes(x = value, y = output)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
df %>%
select(x4, y) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "y")) %>%
ggplot(mapping = aes(x = value, y = y)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
df %>%
select(x4, output) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "output")) %>%
ggplot(mapping = aes(x = value, y = output)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
df %>%
select(v1,v3,v5, y) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "y")) %>%
ggplot(mapping = aes(x = value, y = y)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
df %>%
select(v2,v4, y) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "y")) %>%
ggplot(mapping = aes(x = value, y = y)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
df %>%
select(v1,v3,v5, output) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "output")) %>%
ggplot(mapping = aes(x = value, y = output)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
df %>%
select(v2,v4, output) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "output")) %>%
ggplot(mapping = aes(x = value, y = output)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
There seems to be little to no trend with respect to ‘y’ and inputs ‘v’ and ‘x4’. There seem to be some relationship between certain values of ‘x1’, ‘x2’, and ‘x3’ that result in a minimized ‘y’.
derived_df %>%
select(t,z, y) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "y")) %>%
ggplot(mapping = aes(x = value, y = y)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
derived_df %>%
select(t,z, output) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "output")) %>%
ggplot(mapping = aes(x = value, y = output)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
derived_df %>%
select(w,x5, y) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "y")) %>%
ggplot(mapping = aes(x = value, y = y)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
derived_df %>%
select(w,x5, output) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "output")) %>%
ggplot(mapping = aes(x = value, y = output)) +
geom_point(alpha = 0.5) +
facet_wrap(~name) +
theme_bw()
It does nto appear that there is a relationship between values of ‘t’ that result in a minimized ‘y’. There may be a relationship between certain values of ‘w’, ‘x5’, and ‘z’ that result in a minimized ‘y’.
derived_df %>%
ggplot(mapping = aes(x = outcome)) +
geom_bar() +
geom_text(stat = 'count',
mapping = aes(label = stat(count)),
color = 'red',
nudge_y = 7,
size = 5.5) +
theme_bw()
Although class “event” occurs at a rate of ~50% of class “non event”, the imbalance is not substantial; subsampling should not be required.
derived_df %>%
select(starts_with('x'),outcome) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "outcome")) %>%
mutate(input_id = as.integer( stringr::str_extract(name, "\\d+") )) %>%
mutate(input_bin = cut(input_id,
breaks = quantile(input_id),
include.lowest = TRUE)) %>%
filter(input_id < 11) %>%
ggplot(mapping = aes(x = value, y = as.factor(input_id))) +
geom_density_ridges(mapping = aes(fill = outcome),
alpha = 0.5) +
ggthemes::scale_fill_colorblind() +
theme_bw() +
theme(strip.text = element_blank())
## Picking joint bandwidth of 0.0262
derived_df %>%
select(starts_with('v'),outcome) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid", "outcome")) %>%
mutate(input_id = as.integer( stringr::str_extract(name, "\\d+") )) %>%
mutate(input_bin = cut(input_id,
breaks = quantile(input_id),
include.lowest = TRUE)) %>%
filter(input_id < 11) %>%
ggplot(mapping = aes(x = value, y = as.factor(input_id))) +
geom_density_ridges(mapping = aes(fill = outcome),
alpha = 0.5) +
#facet_wrap(~input_bin, scales = "free_y") +
ggthemes::scale_fill_colorblind() +
theme_bw() +
theme(strip.text = element_blank())
## Picking joint bandwidth of 0.446